import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.offline as py
import plotly
plotly.offline.init_notebook_mode()
import datetime
# Read the price history from petr4.csv into a DataFrame.
dataset = pd.read_csv('petr4.csv')
# Parse the Date column so it holds datetimes rather than raw strings.
dataset['Date'] = pd.to_datetime(dataset.Date)
dataset.tail()
# Per-row variation: closing price minus opening price.
dataset['Variation'] = dataset.Close - dataset.Open
dataset.head()
# Plot the closing price over time for the analyzed range from 2010 to 2017.
x1 = dataset.Date
y1 = dataset.Close
data = [go.Scatter(x=x1, y=y1)]
layout = go.Layout(
    xaxis=dict(
        # NOTE(review): these bounds are not ISO 'YYYY-MM-DD' strings, so it
        # is unclear whether the end is 11 April or 4 November 2017 --
        # confirm and switch to an unambiguous format.
        range=['01-01-2010', '11-04-2017'],
        title='Year',
    ),
    yaxis=dict(
        # The y-axis shows prices, so both bounds come from y1. The lower
        # bound used to be taken from x1 (the dates), which is not a price.
        range=[min(y1), max(y1)],
        title='Stock Price',
    ),
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
# Candlestick chart built from the first 7 rows of the dataset.
# NOTE(review): these are the "last 7 days" only if the CSV is sorted
# newest-first -- confirm against the data.
dataset2 = dataset.head(7)
candle_fields = dict(
    x=dataset2.Date,
    open=dataset2.Open,
    high=dataset2.High,
    low=dataset2.Low,
    close=dataset2.Close,
)
dados = go.Candlestick(**candle_fields)
data = [dados]
py.iplot(data, filename='graph_candlestick')
# Variation in the period
%matplotlib notebook
import matplotlib.dates as mdates
import datetime as dt
x = dataset['Date']
y = dataset['Variation']
plt.plot_date(x,y, color='r',fmt="r-")
plt.xticks(rotation=30)
plt.show()
training = dataset
# Scatter plot of Open and Close prices from last 100 days
%matplotlib notebook
x = training.Open[:100]
y = training.Close[:100]
plt.scatter(x,y,color='b')
plt.xlabel('open price')
plt.ylabel('close price')
plt.axis([min(x),max(x),min(y),max(y)])
plt.autoscale('False')
plt.show()
# Scatter plot of High and Close prices from last 100 days
%matplotlib notebook
x = training.High[:100]
y = training.Close[:100]
plt.scatter(x,y,color='b')
plt.xlabel('high price')
plt.ylabel('close price')
plt.axis([min(x),max(x),min(y),max(y)])
plt.autoscale('False')
plt.show()
# Scatter plot of Low and Close price from last 100 days
%matplotlib notebook
x = training.Low[:100]
y = training.Close[:100]
plt.scatter(x,y,color='b')
plt.xlabel('low price')
plt.ylabel('close price')
plt.axis([min(x),max(x),min(y),max(y)])
plt.autoscale('False')
plt.show()
# Scatter plot of Volume and Close price from last 100 days
%matplotlib notebook
x = training.Volume[:100]
y = training.Close[:100]
plt.scatter(x,y,color='b')
plt.xlabel('Volume')
plt.ylabel('close price')
plt.axis([min(x),max(x),min(y),max(y)])
plt.ticklabel_format(style='plain', axis='x')
plt.autoscale('False')
plt.xticks(rotation=45)
plt.show()
# Columns used as model inputs.
features = ['Open', 'High', 'Low', 'Volume']
training = training.loc[:, features]
training.head()
# The target attribute is the Close column.
y = dataset.Close
# Split into training and test subsets; the fixed seed makes the split
# reproducible.
split_parts = train_test_split(training, y, random_state=42)
X_training, X_test, y_training, y_test = split_parts
# Build the linear regression model and fit it on the training subset
# (fit returns the estimator itself, so both steps can be chained).
lr_model = LinearRegression().fit(X_training, y_training)
# Inspect the fitted coefficients, one per feature column.
lr_model.coef_
# Predict closing prices for the held-out test subset.
results = lr_model.predict(X_test)
import sklearn.metrics as metrics
def regression_results(y_true, y_pred):
    """Print a summary of regression metrics for a set of predictions.

    The metrics come from ``sklearn.metrics`` (imported in this file as
    ``metrics``) and are printed rounded to four decimal places.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        None. The report is written to standard output.
    """
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)
    try:
        msle = metrics.mean_squared_log_error(y_true, y_pred)
    except ValueError:
        # MSLE is rejected by scikit-learn when either array contains a
        # negative value, which an unconstrained linear model can predict.
        # Report it as NaN instead of aborting the whole report.
        msle = float('nan')

    print('explained_variance: ', round(explained_variance, 4))
    print('mean_squared_log_error: ', round(msle, 4))
    print('r2: ', round(r2, 4))
    print('MAE: ', round(mae, 4))
    print('MSE: ', round(mse, 4))
    print('RMSE: ', round(np.sqrt(mse), 4))
# Calculate the metrics for the results
regression_results(y_test,results)
%matplotlib notebook
prediction = pd.DataFrame(lr_model.predict(X_test))
actual = pd.DataFrame(y_test.values)
# Graphic style
plt.style.use("ggplot")
# Axis titles
plt.xlabel('Prices')
plt.ylabel('Indexes')
plt.title('Actual Prices vs Prediction')
# Sort values and plot lines
plt.plot(prediction.sort_values(by=0),prediction.index)
plt.plot(actual.sort_values(by=0),actual.index)
# Set graph labels
plt.legend(['Prediction','Actual Price'])